{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy\n",
    "\n",
    "from pydqc import infer_schema, data_summary, data_compare, data_consist\n",
    "\n",
    "#import sys\n",
    "#sys.path.insert(0, '../pydqc')\n",
    "#import infer_schema, data_summary, data_compare, data_consist"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load data\n",
    "Test data is from Kaggle competition: **Zillow Prize: Zillow’s Home Value Prediction (Zestimate)**  \n",
    "link: https://www.kaggle.com/c/zillow-prize-1  \n",
    "\n",
    "You need to download the following files, unzip and put them into the 'data' folder.  \n",
    "1. properties_2016.csv.zip  \n",
    "2. properties_2017.csv.zip  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 31.1 s\n",
      "Wall time: 38.2 s\n"
     ]
    }
   ],
   "source": [
    "%time data_2016 = pd.read_csv('data/properties_2016.csv')\n",
    "%time data_2017 = pd.read_csv('data/properties_2017.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2985217, 58)\n",
      "(2985217, 58)\n"
     ]
    }
   ],
   "source": [
    "print data_2016.shape\n",
    "print data_2017.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>parcelid</th>\n",
       "      <th>airconditioningtypeid</th>\n",
       "      <th>architecturalstyletypeid</th>\n",
       "      <th>basementsqft</th>\n",
       "      <th>bathroomcnt</th>\n",
       "      <th>bedroomcnt</th>\n",
       "      <th>buildingclasstypeid</th>\n",
       "      <th>buildingqualitytypeid</th>\n",
       "      <th>calculatedbathnbr</th>\n",
       "      <th>decktypeid</th>\n",
       "      <th>...</th>\n",
       "      <th>numberofstories</th>\n",
       "      <th>fireplaceflag</th>\n",
       "      <th>structuretaxvaluedollarcnt</th>\n",
       "      <th>taxvaluedollarcnt</th>\n",
       "      <th>assessmentyear</th>\n",
       "      <th>landtaxvaluedollarcnt</th>\n",
       "      <th>taxamount</th>\n",
       "      <th>taxdelinquencyflag</th>\n",
       "      <th>taxdelinquencyyear</th>\n",
       "      <th>censustractandblock</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10754147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10759547</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27516.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>27516.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10843547</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>650756.0</td>\n",
       "      <td>1413387.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>762631.0</td>\n",
       "      <td>20800.37</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10859147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>571346.0</td>\n",
       "      <td>1156834.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>585488.0</td>\n",
       "      <td>14557.57</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10879947</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>193796.0</td>\n",
       "      <td>433491.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>239695.0</td>\n",
       "      <td>5725.17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 58 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   parcelid  airconditioningtypeid  architecturalstyletypeid  basementsqft  \\\n",
       "0  10754147                    NaN                       NaN           NaN   \n",
       "1  10759547                    NaN                       NaN           NaN   \n",
       "2  10843547                    NaN                       NaN           NaN   \n",
       "3  10859147                    NaN                       NaN           NaN   \n",
       "4  10879947                    NaN                       NaN           NaN   \n",
       "\n",
       "   bathroomcnt  bedroomcnt  buildingclasstypeid  buildingqualitytypeid  \\\n",
       "0          0.0         0.0                  NaN                    NaN   \n",
       "1          0.0         0.0                  NaN                    NaN   \n",
       "2          0.0         0.0                  NaN                    NaN   \n",
       "3          0.0         0.0                  3.0                    7.0   \n",
       "4          0.0         0.0                  4.0                    NaN   \n",
       "\n",
       "   calculatedbathnbr  decktypeid         ...           numberofstories  \\\n",
       "0                NaN         NaN         ...                       NaN   \n",
       "1                NaN         NaN         ...                       NaN   \n",
       "2                NaN         NaN         ...                       NaN   \n",
       "3                NaN         NaN         ...                       1.0   \n",
       "4                NaN         NaN         ...                       NaN   \n",
       "\n",
       "   fireplaceflag  structuretaxvaluedollarcnt  taxvaluedollarcnt  \\\n",
       "0            NaN                         NaN                9.0   \n",
       "1            NaN                         NaN            27516.0   \n",
       "2            NaN                    650756.0          1413387.0   \n",
       "3            NaN                    571346.0          1156834.0   \n",
       "4            NaN                    193796.0           433491.0   \n",
       "\n",
       "   assessmentyear  landtaxvaluedollarcnt  taxamount  taxdelinquencyflag  \\\n",
       "0          2015.0                    9.0        NaN                 NaN   \n",
       "1          2015.0                27516.0        NaN                 NaN   \n",
       "2          2015.0               762631.0   20800.37                 NaN   \n",
       "3          2015.0               585488.0   14557.57                 NaN   \n",
       "4          2015.0               239695.0    5725.17                 NaN   \n",
       "\n",
       "   taxdelinquencyyear  censustractandblock  \n",
       "0                 NaN                  NaN  \n",
       "1                 NaN                  NaN  \n",
       "2                 NaN                  NaN  \n",
       "3                 NaN                  NaN  \n",
       "4                 NaN                  NaN  \n",
       "\n",
       "[5 rows x 58 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2016.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>parcelid</th>\n",
       "      <th>airconditioningtypeid</th>\n",
       "      <th>architecturalstyletypeid</th>\n",
       "      <th>basementsqft</th>\n",
       "      <th>bathroomcnt</th>\n",
       "      <th>bedroomcnt</th>\n",
       "      <th>buildingclasstypeid</th>\n",
       "      <th>buildingqualitytypeid</th>\n",
       "      <th>calculatedbathnbr</th>\n",
       "      <th>decktypeid</th>\n",
       "      <th>...</th>\n",
       "      <th>numberofstories</th>\n",
       "      <th>fireplaceflag</th>\n",
       "      <th>structuretaxvaluedollarcnt</th>\n",
       "      <th>taxvaluedollarcnt</th>\n",
       "      <th>assessmentyear</th>\n",
       "      <th>landtaxvaluedollarcnt</th>\n",
       "      <th>taxamount</th>\n",
       "      <th>taxdelinquencyflag</th>\n",
       "      <th>taxdelinquencyyear</th>\n",
       "      <th>censustractandblock</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10754147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10759547</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27516.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>27516.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10843547</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>660680.0</td>\n",
       "      <td>1434941.0</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>774261.0</td>\n",
       "      <td>20800.37</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10859147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>580059.0</td>\n",
       "      <td>1174475.0</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>594416.0</td>\n",
       "      <td>14557.57</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10879947</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>196751.0</td>\n",
       "      <td>440101.0</td>\n",
       "      <td>2016.0</td>\n",
       "      <td>243350.0</td>\n",
       "      <td>5725.17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 58 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   parcelid  airconditioningtypeid  architecturalstyletypeid  basementsqft  \\\n",
       "0  10754147                    NaN                       NaN           NaN   \n",
       "1  10759547                    NaN                       NaN           NaN   \n",
       "2  10843547                    NaN                       NaN           NaN   \n",
       "3  10859147                    NaN                       NaN           NaN   \n",
       "4  10879947                    NaN                       NaN           NaN   \n",
       "\n",
       "   bathroomcnt  bedroomcnt  buildingclasstypeid  buildingqualitytypeid  \\\n",
       "0          0.0         0.0                  NaN                    NaN   \n",
       "1          0.0         0.0                  NaN                    NaN   \n",
       "2          0.0         0.0                  5.0                    NaN   \n",
       "3          0.0         0.0                  3.0                    6.0   \n",
       "4          0.0         0.0                  4.0                    NaN   \n",
       "\n",
       "   calculatedbathnbr  decktypeid         ...           numberofstories  \\\n",
       "0                NaN         NaN         ...                       NaN   \n",
       "1                NaN         NaN         ...                       NaN   \n",
       "2                NaN         NaN         ...                       1.0   \n",
       "3                NaN         NaN         ...                       1.0   \n",
       "4                NaN         NaN         ...                       1.0   \n",
       "\n",
       "   fireplaceflag  structuretaxvaluedollarcnt  taxvaluedollarcnt  \\\n",
       "0            NaN                         NaN                9.0   \n",
       "1            NaN                         NaN            27516.0   \n",
       "2            NaN                    660680.0          1434941.0   \n",
       "3            NaN                    580059.0          1174475.0   \n",
       "4            NaN                    196751.0           440101.0   \n",
       "\n",
       "   assessmentyear  landtaxvaluedollarcnt  taxamount  taxdelinquencyflag  \\\n",
       "0          2016.0                    9.0        NaN                 NaN   \n",
       "1          2015.0                27516.0        NaN                 NaN   \n",
       "2          2016.0               774261.0   20800.37                 NaN   \n",
       "3          2016.0               594416.0   14557.57                 NaN   \n",
       "4          2016.0               243350.0    5725.17                 NaN   \n",
       "\n",
       "   taxdelinquencyyear  censustractandblock  \n",
       "0                 NaN                  NaN  \n",
       "1                 NaN                  NaN  \n",
       "2                 NaN                  NaN  \n",
       "3                 NaN                  NaN  \n",
       "4                 NaN                  NaN  \n",
       "\n",
       "[5 rows x 58 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2017.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# infer schema"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### infer schema based on full set of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 4min 56s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "infer_schema.infer_schema(data=data_2016, fname='properties_2016', output_root='output/', \n",
    "                          sample_size=1.0, type_threshold=0.5, n_jobs=2, base_schema=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### infer schema based on sample data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 53.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "infer_schema.infer_schema(data=data_2016, fname='properties_2016_sample', output_root='output/', \n",
    "                          sample_size=0.1, type_threshold=0.5, n_jobs=2, base_schema=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### you should check the generated data schema \n",
    "and modify the data types if necessary. :)  \n",
    "You can do the modification based on the data dictionary **zillow_data_dictionary.xlsx**.  \n",
    "It's better to save the modified data schema with different name from the original one.  \n",
    "In this example, the modified data schema is saved as XXX_mdf.xlsx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### infer schema based on base_schema\n",
    "Since we have already generated data schema for data_2016, we can use the **modified** data schema of data_2016 to infer the schema for data_2017. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_2016_schema = pd.read_excel('output/data_schema_properties_2016_mdf.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>type</th>\n",
       "      <th>include</th>\n",
       "      <th>sample_value</th>\n",
       "      <th>sample_num_uni</th>\n",
       "      <th>sample_uni_percentage</th>\n",
       "      <th>sample_min</th>\n",
       "      <th>sample_median</th>\n",
       "      <th>sample_max</th>\n",
       "      <th>sample_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>parcelid</td>\n",
       "      <td>key</td>\n",
       "      <td>1</td>\n",
       "      <td>[11238357, 12727866, 12914860, 12397429, 12616...</td>\n",
       "      <td>2985217</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>10711725.0</td>\n",
       "      <td>12545094.0</td>\n",
       "      <td>169601949.0</td>\n",
       "      <td>7.909965e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>airconditioningtypeid</td>\n",
       "      <td>str</td>\n",
       "      <td>1</td>\n",
       "      <td>[1.0, 1.0, 1.0, 1.0, 1.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>0.00001</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>3.148585e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>architecturalstyletypeid</td>\n",
       "      <td>str</td>\n",
       "      <td>1</td>\n",
       "      <td>[7.0, 7.0, 7.0, 7.0, 8.0]</td>\n",
       "      <td>8</td>\n",
       "      <td>0.00132</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>2.436090e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>basementsqft</td>\n",
       "      <td>numeric</td>\n",
       "      <td>1</td>\n",
       "      <td>[676.0, 130.0, 264.0, 1056.0, 120.0]</td>\n",
       "      <td>751</td>\n",
       "      <td>0.46130</td>\n",
       "      <td>20.0</td>\n",
       "      <td>534.0</td>\n",
       "      <td>8516.0</td>\n",
       "      <td>5.386280e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bathroomcnt</td>\n",
       "      <td>numeric</td>\n",
       "      <td>1</td>\n",
       "      <td>[2.0, 1.0, 2.0, 2.0, 4.0]</td>\n",
       "      <td>37</td>\n",
       "      <td>0.00001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1.077754e+00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     column     type  include  \\\n",
       "0                  parcelid      key        1   \n",
       "1     airconditioningtypeid      str        1   \n",
       "2  architecturalstyletypeid      str        1   \n",
       "3              basementsqft  numeric        1   \n",
       "4               bathroomcnt  numeric        1   \n",
       "\n",
       "                                        sample_value  sample_num_uni  \\\n",
       "0  [11238357, 12727866, 12914860, 12397429, 12616...         2985217   \n",
       "1                          [1.0, 1.0, 1.0, 1.0, 1.0]               7   \n",
       "2                          [7.0, 7.0, 7.0, 7.0, 8.0]               8   \n",
       "3               [676.0, 130.0, 264.0, 1056.0, 120.0]             751   \n",
       "4                          [2.0, 1.0, 2.0, 2.0, 4.0]              37   \n",
       "\n",
       "   sample_uni_percentage  sample_min  sample_median   sample_max    sample_std  \n",
       "0                1.00000  10711725.0     12545094.0  169601949.0  7.909965e+06  \n",
       "1                0.00001         1.0            1.0         13.0  3.148585e+00  \n",
       "2                0.00132         2.0            7.0         27.0  2.436090e+00  \n",
       "3                0.46130        20.0          534.0       8516.0  5.386280e+02  \n",
       "4                0.00001         0.0            2.0         20.0  1.077754e+00  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2016_schema.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 45.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "infer_schema.infer_schema(data=data_2017, fname='properties_2017_sample', output_root='output/', \n",
    "                          sample_size=0.1, type_threshold=0.5, n_jobs=2, base_schema=data_2016_schema)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data_summary\n",
    "generate data summary report based on the **modified** data schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_2016_schema = pd.read_excel('output/data_schema_properties_2016_mdf.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>type</th>\n",
       "      <th>include</th>\n",
       "      <th>sample_value</th>\n",
       "      <th>sample_num_uni</th>\n",
       "      <th>sample_uni_percentage</th>\n",
       "      <th>sample_min</th>\n",
       "      <th>sample_median</th>\n",
       "      <th>sample_max</th>\n",
       "      <th>sample_std</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>parcelid</td>\n",
       "      <td>key</td>\n",
       "      <td>1</td>\n",
       "      <td>[11238357, 12727866, 12914860, 12397429, 12616...</td>\n",
       "      <td>2985217</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>10711725.0</td>\n",
       "      <td>12545094.0</td>\n",
       "      <td>169601949.0</td>\n",
       "      <td>7.909965e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>airconditioningtypeid</td>\n",
       "      <td>str</td>\n",
       "      <td>1</td>\n",
       "      <td>[1.0, 1.0, 1.0, 1.0, 1.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>0.00001</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>3.148585e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>architecturalstyletypeid</td>\n",
       "      <td>str</td>\n",
       "      <td>1</td>\n",
       "      <td>[7.0, 7.0, 7.0, 7.0, 8.0]</td>\n",
       "      <td>8</td>\n",
       "      <td>0.00132</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>2.436090e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>basementsqft</td>\n",
       "      <td>numeric</td>\n",
       "      <td>1</td>\n",
       "      <td>[676.0, 130.0, 264.0, 1056.0, 120.0]</td>\n",
       "      <td>751</td>\n",
       "      <td>0.46130</td>\n",
       "      <td>20.0</td>\n",
       "      <td>534.0</td>\n",
       "      <td>8516.0</td>\n",
       "      <td>5.386280e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bathroomcnt</td>\n",
       "      <td>numeric</td>\n",
       "      <td>1</td>\n",
       "      <td>[2.0, 1.0, 2.0, 2.0, 4.0]</td>\n",
       "      <td>37</td>\n",
       "      <td>0.00001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1.077754e+00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     column     type  include  \\\n",
       "0                  parcelid      key        1   \n",
       "1     airconditioningtypeid      str        1   \n",
       "2  architecturalstyletypeid      str        1   \n",
       "3              basementsqft  numeric        1   \n",
       "4               bathroomcnt  numeric        1   \n",
       "\n",
       "                                        sample_value  sample_num_uni  \\\n",
       "0  [11238357, 12727866, 12914860, 12397429, 12616...         2985217   \n",
       "1                          [1.0, 1.0, 1.0, 1.0, 1.0]               7   \n",
       "2                          [7.0, 7.0, 7.0, 7.0, 8.0]               8   \n",
       "3               [676.0, 130.0, 264.0, 1056.0, 120.0]             751   \n",
       "4                          [2.0, 1.0, 2.0, 2.0, 4.0]              37   \n",
       "\n",
       "   sample_uni_percentage  sample_min  sample_median   sample_max    sample_std  \n",
       "0                1.00000  10711725.0     12545094.0  169601949.0  7.909965e+06  \n",
       "1                0.00001         1.0            1.0         13.0  3.148585e+00  \n",
       "2                0.00132         2.0            7.0         27.0  2.436090e+00  \n",
       "3                0.46130        20.0          534.0       8516.0  5.386280e+02  \n",
       "4                0.00001         0.0            2.0         20.0  1.077754e+00  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2016_schema.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 55.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_summary.data_summary(table_schema=data_2016_schema, table=data_2016, fname='properties_2016', \n",
    "                          sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### generate data summary notebook\n",
    "if you want to do further checking based on the basic methods provided by **data_summary**, you can generate a notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_summary.data_summary_notebook(table_schema=data_2016_schema, table=data_2016, \n",
    "                                   fname='properties_2016', output_root='output/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data compare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_2017_schema = pd.read_excel('output/data_schema_properties_2017_sample_mdf.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 2min 3s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_compare.data_compare(table1=data_2016, table2=data_2017, schema1=data_2016_schema, schema2=data_2017_schema,\n",
    "                          fname='properties_2016', sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### generate data compare notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_compare.data_compare_notebook(table1=data_2016, table2=data_2017, schema1=data_2016_schema, schema2=data_2017_schema,\n",
    "                                   fname='properties_2016', output_root='output/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data consist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "data_consist.data_consist(table1=data_2016, table2=data_2017, key1='parcelid', key2='parcelid',\n",
    "                          schema1=data_2016_schema, schema2=data_2017_schema,\n",
    "                          fname='properties_2016', sample_size=1.0, output_root='output/', keep_images=False, n_jobs=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_consist.data_consist_notebook(table1=data_2016, table2=data_2017, key1='parcelid', key2='parcelid',\n",
    "                                   schema1=data_2016_schema, schema2=data_2017_schema,\n",
    "                                   fname='properties_2016', output_root='output/')"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
